import csv
import os

# Source folders to scan, keyed by folder name. Each entry carries the label
# written to the CSV "Category" column ("name") and the one-letter "prefix"
# that starts every ID generated for files in that folder.
categories = {
    folder_name: {"name": label, "prefix": id_prefix}
    for folder_name, label, id_prefix in (
        ("biographies", "Biography", "b"),
        ("historical-events", "Historical Event", "h"),
        ("world-events", "World News Event", "w"),
        ("scientific-discoveries", "Scientific Discovery", "d"),
    )
}


def extract_title(filename):
    """Return a display title derived from *filename*.

    The last extension (as split by ``os.path.splitext``) is dropped and
    every underscore is replaced with a space, so ``"Marie_Curie.txt"``
    becomes ``"Marie Curie"``.
    """
    stem, _extension = os.path.splitext(filename)
    return stem.replace("_", " ")


# Create the CSV file: one row per ".txt" file found in each category folder.
csv_file = "supernova_dataset.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    # QUOTE_ALL puts every field in quotes, including the free-text content.
    writer = csv.writer(file, quoting=csv.QUOTE_ALL)
    # Write header with ID field
    writer.writerow(["ID", "Category", "Title", "Content"])

    # Process each category folder
    for folder, category_info in categories.items():
        # os.listdir() returns entries in arbitrary order; sort them so the
        # same set of files always receives the same IDs and row order.
        try:
            filenames = sorted(os.listdir(folder))
        except OSError as e:
            # A missing or unreadable folder should not abort the whole export
            # and leave a truncated CSV behind; report it and move on.
            print(f"Error reading folder {folder}: {e}")
            continue

        # IDs restart at 1 within each category (b001, h001, ...).
        category_counter = 1

        for filename in filenames:
            if not filename.endswith(".txt"):
                continue

            file_path = os.path.join(folder, filename)
            title = extract_title(filename)

            # Create prefixed ID (e.g., b001, d001, etc.). The number is
            # consumed before the file is read, so an unreadable file leaves
            # a gap instead of shifting the IDs of the files after it.
            prefixed_id = f"{category_info['prefix']}{category_counter:03d}"
            category_counter += 1

            # Only the read is guarded: an unreadable or non-UTF-8 input file
            # is skipped, while a failure writing the output CSV propagates
            # instead of being reported as a per-file input error.
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read().strip()
            except (OSError, UnicodeDecodeError) as e:
                print(f"Error processing {file_path}: {e}")
                continue

            # Write to CSV with prefixed ID
            writer.writerow([prefixed_id, category_info["name"], title, content])
            print(f"Processed: {title}")

print(f"CSV file created: {csv_file}")
